import pandas as pd
# Load the diabetes dataset from the project-relative CSV file.
data = pd.read_csv('./data/diabetes.csv')
# Allow up to 150 characters per line before pandas wraps wide frames.
pd.set_option('display.width', 150)
# Use the fully qualified option key: the bare 'precision' pattern is ambiguous
# on newer pandas releases (it also matches 'styler.format.precision') and
# raises OptionError there.
pd.set_option('display.precision', 2)
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 | 768.00 |
| mean | 3.85 | 120.89 | 69.11 | 20.54 | 79.80 | 31.99 | 0.47 | 33.24 | 0.35 |
| std | 3.37 | 31.97 | 19.36 | 15.95 | 115.24 | 7.88 | 0.33 | 11.76 | 0.48 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.08 | 21.00 | 0.00 |
| 25% | 1.00 | 99.00 | 62.00 | 0.00 | 0.00 | 27.30 | 0.24 | 24.00 | 0.00 |
| 50% | 3.00 | 117.00 | 72.00 | 23.00 | 30.50 | 32.00 | 0.37 | 29.00 | 0.00 |
| 75% | 6.00 | 140.25 | 80.00 | 32.00 | 127.25 | 36.60 | 0.63 | 41.00 | 1.00 |
| max | 17.00 | 199.00 | 122.00 | 99.00 | 846.00 | 67.10 | 2.42 | 81.00 | 1.00 |
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 DiabetesPedigreeFunction 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB
# Count the rows in each 'Outcome' group to see how balanced the classes are.
data.groupby('Outcome').size()
Outcome 0 500 1 268 dtype: int64
# Pairwise correlation between all columns (DataFrame.corr defaults to Pearson).
data.corr()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| Pregnancies | 1.000 | 0.129 | 0.141 | -0.082 | -0.074 | 0.018 | -0.034 | 0.544 | 0.222 |
| Glucose | 0.129 | 1.000 | 0.153 | 0.057 | 0.331 | 0.221 | 0.137 | 0.264 | 0.467 |
| BloodPressure | 0.141 | 0.153 | 1.000 | 0.207 | 0.089 | 0.282 | 0.041 | 0.240 | 0.065 |
| SkinThickness | -0.082 | 0.057 | 0.207 | 1.000 | 0.437 | 0.393 | 0.184 | -0.114 | 0.075 |
| Insulin | -0.074 | 0.331 | 0.089 | 0.437 | 1.000 | 0.198 | 0.185 | -0.042 | 0.131 |
| BMI | 0.018 | 0.221 | 0.282 | 0.393 | 0.198 | 1.000 | 0.141 | 0.036 | 0.293 |
| DiabetesPedigreeFunction | -0.034 | 0.137 | 0.041 | 0.184 | 0.185 | 0.141 | 1.000 | 0.034 | 0.174 |
| Age | 0.544 | 0.264 | 0.240 | -0.114 | -0.042 | 0.036 | 0.034 | 1.000 | 0.238 |
| Outcome | 0.222 | 0.467 | 0.065 | 0.075 | 0.131 | 0.293 | 0.174 | 0.238 | 1.000 |
# Skewness of each column's distribution.
data.skew()
Pregnancies 0.902 Glucose 0.174 BloodPressure -1.844 SkinThickness 0.109 Insulin 2.272 BMI -0.429 DiabetesPedigreeFunction 1.920 Age 1.130 Outcome 0.635 dtype: float64
import matplotlib.pyplot as plt
# Figure settings shared by the per-column distribution plots below.
figure_size = (15, 10)
grid_shape = (3, 3)

# Histogram of every column.
data.hist(figsize=figure_size)
plt.show()

# Density curve per column, one subplot each, with independent x-axes.
data.plot(
    kind='density',
    subplots=True,
    layout=grid_shape,
    sharex=False,
    figsize=figure_size,
)
plt.show()

# Box plot per column, one subplot each, with independent x- and y-axes.
data.plot(
    kind='box',
    subplots=True,
    layout=grid_shape,
    sharex=False,
    sharey=False,
    figsize=figure_size,
)
plt.show()
import numpy
# Correlation matrix of the columns, drawn below as a colour-coded heat-map.
correlations = data.corr()
# Take the labels from the matrix that is actually plotted so that ticks and
# labels always agree with it, whatever the number of columns.
names = list(correlations.columns)
# plot correlation matrix
fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot()
# Pin the colour scale to the full correlation range [-1, 1].
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per matrix row/column instead of a hard-coded count of 9.
ticks = numpy.arange(len(names))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names)
ax.set_yticklabels(names)
plt.show()
from pandas.plotting import scatter_matrix
# Grid of pairwise scatter plots for every pair of columns, with grid lines on.
scatter_matrix(frame=data, grid=True, figsize=(15, 15))
plt.show()